def cleanbody(s):
    """Normalise the whitespace of s and count its words.

    The string is split on single spaces and empty pieces are dropped, so
    runs of spaces collapse to one.  A piece longer than two characters
    that both starts and ends with a single quote is rewritten as
    '" inner "' (double quotes padded with spaces); it still counts as
    one word.

    Returns a two-element list [number_of_words, cleaned_string].
    """
    tokens = []
    for token in s.split(' '):
        if not token:
            continue
        if len(token) > 2 and token.startswith("'") and token.endswith("'"):
            token = '" ' + token[1:-1] + ' "'
        tokens.append(token)
    return [len(tokens), ' '.join(tokens)]

def data_integration():
    """Merge the raw post and comment dumps into ../data/data.tsv.

    Reads ../raw/data-suicidewatch-originalposts.tsv and
    ../raw/data-suicidewatch-comments.tsv (both with one header line).
    A post or comment is kept only if its cleaned body (see cleanbody) has
    at least two words and its timestamp is not earlier than starttime.
    Comments are additionally dropped when their link id does not refer to
    a kept post.

    The output starts with a NUM_LINK/NUM_COMMENT summary line and a column
    header line, followed by every post (ROOT = 'Y') in timestamp order,
    each immediately followed by its comments (ROOT = 'N') in timestamp
    order.
    """
    import datetime
    starttime = 1385874000
    linkid2data = {}

    # Original posts: rows that do not have exactly 6 columns are skipped.
    with open('../raw/data-suicidewatch-originalposts.tsv','r') as fr:
        fr.readline()
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            if len(arr) != 6: continue
            numword,body = cleanbody(arr[3])
            if numword < 2: continue
            posttime = int(float(arr[4]))
            if posttime < starttime: continue
            score = int(arr[5])
            # Post ids are prefixed with 't3_' so they match the link id
            # column of the comment file.
            linkid2data['t3_'+arr[0]] = [[],arr[1],arr[2],body,posttime,score]
    numlinkid = len(linkid2data)

    numcomment = 0
    with open('../raw/data-suicidewatch-comments.tsv','r') as fr:
        fr.readline()
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            # Columns up to index 9 are read below; skip truncated rows the
            # same way the post loop skips malformed rows.
            if len(arr) < 10: continue
            author = arr[2]
            numword,body = cleanbody(arr[3])
            if numword < 2: continue
            # Parsed like post timestamps so that values written as floats
            # are accepted here too.
            posttime = int(float(arr[4]))
            if posttime < starttime: continue
            linkid = arr[5]
            postid = arr[6]
            parentid = arr[7]
            score = int(arr[9])
            if linkid not in linkid2data: continue
            linkid2data[linkid][0].append([author,body,posttime,score,postid,parentid])
            numcomment += 1

    with open('../data/data.tsv','w') as fw:
        fw.write('NUM_LINK'+'\t'+str(numlinkid)+'\t'+'NUM_COMMENT'+'\t'+str(numcomment)+'\n')
        fw.write('LINKID\tROOT\tAUTHOR\tBODY\tTIME\tTIMESTAMP\tSCORE\tTITLE\tPOSTID\tPARENTID\n')
        for linkid,(comments,title,author,body,posttime,score) in sorted(linkid2data.items(),key=lambda x:x[1][4]):
            # NOTE: fromtimestamp() without a tz argument yields local time,
            # so the TIMESTAMP column depends on the machine's time zone.
            stamp = datetime.datetime.fromtimestamp(posttime)
            fw.write('\t'.join([linkid,'Y',author,body,str(posttime),str(stamp),str(score),title,'',''])+'\n')
            for _author,_body,_posttime,_score,_postid,_parentid in sorted(comments,key=lambda x:x[2]):
                stamp = datetime.datetime.fromtimestamp(_posttime)
                fw.write('\t'.join([linkid,'N',_author,_body,str(_posttime),str(stamp),str(_score),'',_postid,_parentid])+'\n')

def data_bipartite():
    """Build the post-comment edge list and the histogram of edge scores.

    Reads ../data/data.tsv (two header lines).  Rows are numbered from 0 in
    file order and that number is the node id.  For each comment an edge
    "post_row,comment_row,comment_score" is written to
    ../data/bipartite.csv unless the post author or the comment author
    starts with '[' or the comment was written by the post's own author.
    ../data/bipartite-score2freq.txt receives the number of edges per score.
    """
    linkid2bipartite = {}
    with open('../data/data.tsv','r') as fr:
        fr.readline()
        fr.readline()
        for lineno,line in enumerate(fr):
            arr = line.strip('\r\n').split('\t')
            linkid = arr[0]
            author = arr[2]
            score = int(arr[6])
            if arr[1] == 'Y':
                linkid2bipartite[linkid] = [lineno,author,[]]
                continue
            root = linkid2bipartite[linkid]
            # startswith() instead of [0] so an empty author field does not
            # raise IndexError.
            if root[1].startswith('[') or author.startswith('[') or author == root[1]:
                continue
            root[2].append([lineno,score])

    score2freq = {}
    with open('../data/bipartite.csv','w') as fw:
        for linkid,(lineno,author,lineno_score) in sorted(linkid2bipartite.items(),key=lambda x:x[1][0]):
            for _lineno,score in lineno_score:
                fw.write(str(lineno)+','+str(_lineno)+','+str(score)+'\n')
                score2freq[score] = score2freq.get(score,0)+1

    with open('../data/bipartite-score2freq.txt','w') as fw:
        fw.write('SCORE\tFREQ\n')
        for score,freq in sorted(score2freq.items(),key=lambda x:x[0]):
            fw.write(str(score)+'\t'+str(freq)+'\n')

def data_timeliness():
    """Write response-time statistics for the posts in ../data/data.tsv.

    Produces four files under ../data/:
      timeliness-numcomment2freq.txt    posts per number of comments
      timeliness-interval2freq-1min.txt posts per delay of the first comment,
                                        rounded up to whole minutes
      timeliness-interval2freq-5min.txt same, in 5-minute buckets
      timeliness-elapsed.txt            for several windows of X minutes, how
                                        many posts got a comment within X
                                        minutes and how many such comments
    Posts without any comment are counted under the key -1 in the 1-minute
    table (written as -5 in the 5-minute table because keys are scaled by 5).
    """
    import numpy as np
    linkid2stat = {}
    with open('../data/data.tsv','r') as fr:
        fr.readline()
        fr.readline()
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            linkid = arr[0]
            posttime = int(arr[4])
            if arr[1] == 'Y':
                linkid2stat[linkid] = [posttime,[]]
            else:
                linkid2stat[linkid][1].append(posttime)
    numlinkid = len(linkid2stat)

    def pct_of_links(count):
        # Guarded so an empty data file yields 0.0 instead of dividing by zero.
        if numlinkid == 0: return 0.0
        return np.round(100.*count/numlinkid,2)

    def write_freq(path,header,key2freq,scale=1):
        # One frequency table: key (optionally scaled), count, share of posts.
        with open(path,'w') as fw:
            fw.write(header)
            for key,freq in sorted(key2freq.items(),key=lambda x:x[0]):
                fw.write(str(key*scale)+'\t'+str(freq)+'\t'+str(pct_of_links(freq))+'\n')

    numcomment2freq = {}
    onemin2freq = {}
    fivemin2freq = {}
    for linkid,(posttime,commenttimes) in linkid2stat.items():
        numcomment = len(commenttimes)
        onemin,fivemin = -1,-1
        if numcomment > 0:
            # commenttimes[0] is treated as the first reply; this relies on
            # data_integration writing each post's comments in time order.
            delay = commenttimes[0]-posttime
            onemin = int(np.ceil(1.*delay/60))
            fivemin = int(np.ceil(1.*delay/60/5))
        numcomment2freq[numcomment] = numcomment2freq.get(numcomment,0)+1
        onemin2freq[onemin] = onemin2freq.get(onemin,0)+1
        fivemin2freq[fivemin] = fivemin2freq.get(fivemin,0)+1

    write_freq('../data/timeliness-numcomment2freq.txt',
               'NUM_COMMENT\tFREQ_LINK\tPCT_LINK\n',numcomment2freq)
    write_freq('../data/timeliness-interval2freq-1min.txt',
               'MINUTES_FIRST_COMMENT\tFREQ_LINK\tPCT_LINK\n',onemin2freq)
    write_freq('../data/timeliness-interval2freq-5min.txt',
               'MINUTES_FIRST_COMMENT\tFREQ_LINK\tPCT_LINK\n',fivemin2freq,scale=5)

    ### Time elapsed from original post to comment

    XS_WITHIN_MINUTES = [1,2,3,5,10,15,20,30,45,60,90,120]
    with open('../data/timeliness-elapsed.txt','w') as fw:
        fw.write('WITHIN_X_MINUTES\tNUM_POSTS\tPCT_POSTS (%)\tNUM_COMMENTS\tAVG_NUM_COMMENTS_PER_POST\n')
        for X_WITHIN_MINUTES in XS_WITHIN_MINUTES:
            limit = X_WITHIN_MINUTES*60
            lstnumcomments = []
            for linkid,(posttime,commenttimes) in linkid2stat.items():
                n = sum(1 for commenttime in commenttimes if commenttime-posttime <= limit)
                if n > 0:
                    lstnumcomments.append(n)
            numposts = len(lstnumcomments)
            pctposts = pct_of_links(numposts)
            numcomments = sum(lstnumcomments)
            # No post answered inside this window: report an average of 0.0
            # rather than raising ZeroDivisionError.
            if numposts > 0:
                avgnumcomments = np.round(1.*numcomments/numposts,2)
            else:
                avgnumcomments = 0.0
            fw.write(str(X_WITHIN_MINUTES)+'\t'+str(numposts)+'\t'+str(pctposts)+'\t'+str(numcomments)+'\t'+str(avgnumcomments)+'\n')

def data_collection():
    """Summarise ../data/data.tsv per month and per user.

    Writes ../data/stat-collection.txt containing
      * one line per month (first 7 characters of the TIMESTAMP column)
        with the number of posts and comments, followed by a 'Total' line;
      * a blank line and the number of distinct users;
      * one line per user with post count, comment count and overall row
        count, ordered by overall row count, highest first.
    """
    month2counts = {} # thread, comment
    uid2counts = {} # thread, comment, all
    with open('../data/data.tsv','r') as fr:
        fr.readline()
        fr.readline()
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            root = arr[1]
            uid = arr[2]
            month = arr[5][:7]
            monthcounts = month2counts.setdefault(month,[0,0])
            uidcounts = uid2counts.setdefault(uid,[0,0,0])
            if root == 'Y':
                monthcounts[0] += 1
                uidcounts[0] += 1
            elif root == 'N':
                monthcounts[1] += 1
                uidcounts[1] += 1
            # The overall counter grows for every row, whatever ROOT holds.
            uidcounts[2] += 1

    with open('../data/stat-collection.txt','w') as fw:
        fw.write('Month'+'\t'+'NumPosts'+'\t'+'NumComments'+'\n')
        totalcounts = [0,0]
        for month,counts in sorted(month2counts.items(),key=lambda x:x[0]):
            totalcounts[0] += counts[0]
            totalcounts[1] += counts[1]
            fw.write(month+'\t'+str(counts[0])+'\t'+str(counts[1])+'\n')
        fw.write('Total'+'\t'+str(totalcounts[0])+'\t'+str(totalcounts[1])+'\n')
        fw.write('\n')
        fw.write('NumUsers'+'\t'+str(len(uid2counts))+'\n')
        for uid,counts in sorted(uid2counts.items(),key=lambda x:-x[1][2]):
            fw.write(uid+'\t'+str(counts[0])+'\t'+str(counts[1])+'\t'+str(counts[2])+'\n')

##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

def cleantext(text):
    """Normalise a post/comment body into space-separated tokens.

    Steps: unify quote characters, undo a few HTML entities, replace some
    emoticons with [smile]/[sad] markers, drop '*' and ' ~', collapse
    whitespace via cleanbody, then tokenise with NLTK and expand common
    English contractions ("'m" -> "am", "n't" -> "not", "ca n't" ->
    "can not", ...).  Tokens that are quote marks or contain '_', '~~',
    '//' or 'http' are dropped.

    Returns the tokens joined by single spaces; '' is returned unchanged
    (without importing NLTK).
    """
    if text == '': return text
    from nltk.tokenize import sent_tokenize,word_tokenize
    text = ' '+text+' '
    text = text.replace('`',"'").replace('’',"'")
    # NOTE(review): '&amp;lt;' becomes '&<' rather than '<' - confirm intended.
    text = text.replace('&amp;lt;',"&<")
    text = text.replace('&lt;',"<").replace('&lt',"<")
    text = text.replace('&gt;',">").replace('&gt',">")
    text = text.replace(' :) ',' [smile] ')
    text = text.replace(' :(.',' [sad] .').replace(' :( ',' [sad] ').replace(' :-( ',' [sad] ')
    text = text.replace('*',' ').replace(' ~',' ')
    text = text.replace(' a)'," (a)").replace(' b)'," (b)").replace(' c)'," (c)")
    text = cleanbody(text)[1]

    # Tokens that are always rewritten the same way, whatever the context.
    fixed = {
        'i': 'I',
        "'m": 'am',
        'im': 'I am',
        'Im': 'I am',
        "'re": 'are',
        "'ve": 'have',
        'ive': 'I have',
        "'d": 'had',
        "'ll": 'will',
        "n't": 'not',
    }
    lowered = ('Are','Ah','Do','Did')
    # Words after which "'s" is read as "is" (otherwise it is left alone).
    is_hosts = ('he','she','it','here','there','this','that','how','what','where','who')
    # Stems word_tokenize leaves in front of "n't" ("ca n't", "wo n't").
    negated = {'ca': 'can', 'wo': 'will'}

    out = []
    for t in sent_tokenize(text):
        ws = word_tokenize(t)
        n = len(ws)
        for i,w in enumerate(ws):
            if w == '``' or w == "''" or '_' in w or '~~' in w or '//' in w or 'http' in w: continue
            if w in fixed:
                w = fixed[w]
            elif w in lowered:
                w = w.lower()
            elif w == "'s":
                if i > 0:
                    prev = ws[i-1].lower()
                    if prev == 'let':
                        w = 'us'
                    elif prev in is_hosts:
                        w = 'is'
            elif w in negated and i < n-1 and ws[i+1] == "n't":
                w = negated[w]
            out.append(w)
    return ' '.join(out)

def postagging(text):
    """Attach a part-of-speech tag to every token of text.

    text is split on single spaces; a new sentence is started after each
    '.', '?' or '!' token and every sentence is tagged with nltk.pos_tag.
    The tokens '[' and '<' are forced to the tag '(' and the tokens ']'
    and '>' to ')'.

    Returns "token:TAG" items joined by single spaces; '' is returned
    unchanged (without importing NLTK).
    """
    if text == '': return text
    import nltk
    from nltk.tokenize import sent_tokenize

    # Group the tokens into sentences, keeping the terminator with its sentence.
    sentences,current = [],[]
    for token in text.split(' '):
        current.append(token)
        if token in ('.','?','!'):
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)

    bracket_tags = {'[': '(', ']': ')', '<': '(', '>': ')'}
    tagged = []
    for sentence in sentences:
        for (elem,tag) in nltk.pos_tag(nltk.Text(sentence)):
            tagged.append(elem+':'+bracket_tags.get(elem,tag))
    return ' '.join(tagged)

def phrasemining(text):
    """Extract candidate phrases from POS-tagged text.

    text is a space-separated sequence of "token:TAG" items (split at the
    last ':' of each item).  Five passes over the tokens collect phrases;
    within a phrase the lower-cased tokens are joined with '_':

      P1  noun phrases ending at the last noun of a noun run, optionally
          preceded by adverbs/adjectives, and - when verbs come before them -
          the verb group and the verb group plus noun phrase
      P2  {RB/JJ} IN/TO PRP {DT} {RB/JJ} {VB/NN} {RB/JJ}
      P3  {VB/NN} PRP {RB/JJ} {VB/NN} {RB/JJ}
      P4  {PRP/RB/JJ/VB/NN} IN/TO {PRP/RB/JJ/VB/NN}
      P5  tokens containing 'smile', 'sad' or 'hug', and the tokens '?' and
          '!', each wrapped in square brackets

    Backward scans never test the tag at index 0: when a scan reaches
    index 0 the phrase starts there whatever that token's tag is.

    Returns the phrases joined by single spaces, in the order found
    (duplicates are kept); '' is returned unchanged.
    """
    if text == '': return text
    lowers,tags = [],[]
    for item in text.split(' '):
        cut = item.rfind(':')
        lowers.append(item[:cut].lower())
        tags.append(item[cut+1:])
    total = len(tags)

    def backward(pos,*prefixes):
        # Move left while the tag matches; stops at index 0 without testing it.
        while pos > 0 and tags[pos].startswith(prefixes):
            pos -= 1
        return pos

    def forward(pos,*prefixes):
        # Move right while the tag matches; may return total (one past the end).
        while pos < total and tags[pos].startswith(prefixes):
            pos += 1
        return pos

    def after(pos):
        # A backward scan stops ON the first non-matching token; step past it
        # unless the scan ran into index 0.
        return pos+1 if pos > 0 else pos

    def joined(start,stop):
        return '_'.join(lowers[start:stop])

    phrases = []

    # P1a, P1b, P1c: {DT/PRP+} {RB/JJ+} {NN+} NN | {RB/JJ+} {VB+} | {RB/JJ+} {VB+} {DT/PRP+} {RB/JJ+} {NN+} NN
    for i in range(total):
        if not tags[i].startswith('NN'): continue
        # Only the last noun of a run of nouns anchors a phrase.
        if i != total-1 and tags[i+1].startswith('NN'): continue
        if i == 0:
            phrases.append(lowers[0])
            continue
        j = backward(i-1,'NN')
        j = backward(j,'RB','JJ')
        j0 = after(j)
        j = backward(j,'PRP','DT')
        j1 = after(j)
        verb_scan_start = j
        j = backward(j,'VB')
        hasVB = j != verb_scan_start
        j = backward(j,'RB','JJ')
        j2 = after(j)
        if i+1-j0 >= 2:
            phrases.append(joined(j0,i+1))
            if hasVB:
                if j1-j2 >= 2:
                    phrases.append(joined(j2,j1))
                phrases.append(joined(j2,i+1))

    # P2: {RB/JJ+} {IN/TO+} PRP {DT+} {RB/JJ+} {VB/NN+} {RB/JJ+}
    for i in range(total):
        if not (tags[i].startswith('PRP') and 1 < i < total-1): continue
        if not tags[i-1].startswith(('IN','TO')): continue
        j0 = after(backward(i-2,'RB','JJ'))
        j1 = forward(i+1,'DT')
        j1 = forward(j1,'RB','JJ')
        j1 = forward(j1,'VB','NN')
        j1 = forward(j1,'RB','JJ')
        if j1 < total-1: j1 -= 1
        if i-j0 >= 2 and j1-i >= 2:
            phrases.append(joined(j0,j1))

    # P3: {VB/NN+} PRP {RB/JJ+} {VB/NN+} {RB/JJ+}
    for i in range(total):
        if not (tags[i].startswith('PRP') and 0 < i < total-1): continue
        j0 = after(backward(i-1,'VB','NN'))
        j1 = forward(i+1,'RB','JJ')
        j1 = forward(j1,'VB','NN')
        j1 = forward(j1,'RB','JJ')
        if i-j0 >= 1 and j1-i >= 2:
            phrases.append(joined(j0,j1))

    # P4: {PRP/RB/JJ/VB/NN+} IN/TO {PRP/RB/JJ/VB/NN}
    content = ('PRP','RB','JJ','VB','NN')
    for i in range(total):
        if not (tags[i].startswith(('IN','TO')) and 0 < i < total-1): continue
        j0 = after(backward(i-1,*content))
        j1 = forward(i+1,*content)
        if i-j0 >= 1 and j1-i >= 2:
            phrases.append(joined(j0,j1))

    # P5: emotion markers and sentence mood.
    for elem in lowers:
        if 'smile' in elem or 'sad' in elem or 'hug' in elem or elem in ('?','!'):
            phrases.append('['+elem+']')

    return ' '.join(phrases)

def data_document():
    """Derive ../data/document.tsv from ../data/data.tsv.

    For every data row the BODY column is cleaned (cleantext), POS-tagged
    (postagging) and mined for phrases (phrasemining); each cleaned token
    is also stemmed with NLTK's PorterStemmer.  One output row is written
    per input row with the columns LINKID, ROOT, WORDS (cleaned text),
    PHRASES and STEM.
    """
    from nltk.stem import PorterStemmer
    porter = PorterStemmer()
    # The input is opened first so a missing data.tsv does not truncate an
    # existing document.tsv.
    with open('../data/data.tsv','r') as fr, open('../data/document.tsv','w') as fw:
        fw.write('LINKID\tROOT\tWORDS\tPHRASES\tSTEM\n')
        fr.readline()
        fr.readline()
        for line in fr:
            # strip('\r\n') as in the other readers of data.tsv; the former
            # strip('') removed nothing.
            arr = line.strip('\r\n').split('\t')
            text = cleantext(arr[3])
            phrases = phrasemining(postagging(text))
            stems = ' '.join(porter.stem(word) for word in text.split(' '))
            fw.write('\t'.join([arr[0],arr[1],text,phrases,stems])+'\n')

def lda_run(NUM_TOPICS_ROOT,NUM_TOPICS_COMMENT):
    """Fit one LDA model on root posts and one on comments and dump the results.

    Input is ../data/document.tsv (one header line).  The WORDS column is
    tokenised with gensim's simple_preprocess; stop words and tokens of
    three characters or fewer are discarded.  The PHRASES column is split
    on spaces.

    Files written under lda/ (R = NUM_TOPICS_ROOT, C = NUM_TOPICS_COMMENT):
      lda_topic_terms_root_R-word.txt       log perplexity, then up to
      lda_topic_terms_comment_C-word.txt    NUM_WORDS_PER_TOPIC terms per topic
      lda_doc_topics_R_C.txt                one line per document with its
                                            "topic:score" pairs
      lda_topic_terms_root_R-phrase.txt     per topic, the phrases of the
      lda_topic_terms_comment_C-phrase.txt  documents whose best topic it is,
                                            weighted by that topic's score and
                                            normalised over the topic
    """
    NUM_WORDS_PER_TOPIC = 300
    from gensim.corpora import Dictionary
    from gensim.models.ldamodel import LdaModel
    from gensim.utils import simple_preprocess
    from gensim.parsing.preprocessing import STOPWORDS

    def fit_and_dump_words(kind,texts,num_topics):
        # Train one model and write its per-topic term list.
        dct = Dictionary(texts)
        corpus = [dct.doc2bow(text) for text in texts]
        lda = LdaModel(corpus=corpus,num_topics=num_topics,passes=10,iterations=100) # passes & iterations
        log_perplexity = lda.log_perplexity(corpus)
        with open('lda/lda_topic_terms_'+kind+'_'+str(num_topics)+'-word.txt','w') as fw:
            fw.write('LOG_PERPLEXITY'+'\t'+str(log_perplexity)+'\n')
            for topicid in range(num_topics):
                for (termid,score) in lda.get_topic_terms(topicid,topn=NUM_WORDS_PER_TOPIC):
                    # The model is built without id2word, so term ids are
                    # mapped back through the dictionary here.
                    fw.write(str(topicid)+'\t'+dct[termid]+'\t'+str(score)+'\n')
        return dct,lda

    def dump_phrases(kind,num_topics,topicid2phrase2score):
        # Write the highest-scoring phrases of each topic, normalised per topic.
        with open('lda/lda_topic_terms_'+kind+'_'+str(num_topics)+'-phrase.txt','w') as fw:
            for topicid in range(num_topics):
                phrase_score = sorted(topicid2phrase2score[topicid].items(),key=lambda x:-x[1])
                totalscore = 0.
                for (phrase,score) in phrase_score:
                    totalscore += score
                for (phrase,score) in phrase_score[:NUM_WORDS_PER_TOPIC]:
                    fw.write(str(topicid)+'\t'+phrase+'\t'+str(1.*score/totalscore)+'\n')

    # (is_root, words, phrases) for every document, in file order.
    documents = []
    textsRoot,textsComment = [],[]
    with open('../data/document.tsv','r') as fr:
        fr.readline()
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            textWord = [token for token in simple_preprocess(arr[2])
                        if token not in STOPWORDS and len(token) > 3]
            textPhrase = [token for token in arr[3].split(' ') if token != '']
            is_root = arr[1] == 'Y'
            if is_root:
                textsRoot.append(textWord)
            else:
                textsComment.append(textWord)
            documents.append((is_root,textWord,textPhrase))

    # The root model is trained before the comment model.
    dctRoot,ldaRoot = fit_and_dump_words('root',textsRoot,NUM_TOPICS_ROOT)
    dctComment,ldaComment = fit_and_dump_words('comment',textsComment,NUM_TOPICS_COMMENT)

    topicid2phrase2scoreRoot = [{} for i in range(NUM_TOPICS_ROOT)]
    topicid2phrase2scoreComment = [{} for i in range(NUM_TOPICS_COMMENT)]
    with open('lda/lda_doc_topics_'+str(NUM_TOPICS_ROOT)+'_'+str(NUM_TOPICS_COMMENT)+'.txt','w') as fw:
        for (is_root,textWord,textPhrase) in documents:
            if is_root:
                dct,lda,topicid2phrase2score = dctRoot,ldaRoot,topicid2phrase2scoreRoot
            else:
                dct,lda,topicid2phrase2score = dctComment,ldaComment,topicid2phrase2scoreComment
            docs = lda.get_document_topics([dct.doc2bow(textWord)])
            # Looked up once so the written distribution and the chosen best
            # topic come from the same result (presumably each lookup re-runs
            # inference in gensim - confirm).
            doc_topics = docs[0]
            fw.write(' '.join(str(topicid)+':'+str(score) for (topicid,score) in doc_topics)+'\n')
            topicid,score = sorted(doc_topics,key=lambda x:-x[1])[0]
            phrase2score = topicid2phrase2score[topicid]
            for phrase in textPhrase:
                phrase2score[phrase] = phrase2score.get(phrase,0.)+score

    dump_phrases('root',NUM_TOPICS_ROOT,topicid2phrase2scoreRoot)
    dump_phrases('comment',NUM_TOPICS_COMMENT,topicid2phrase2scoreComment)

def lda_coherence(PAIRS_NUM_TOPICS):
    """Compute u_mass coherence of the representative topic words.

    PAIRS_NUM_TOPICS is an iterable of [NUM_TOPICS_ROOT, NUM_TOPICS_COMMENT]
    pairs.  For each pair the file lda/network_lda_<R>_<C>.txt is read; its
    '@P<id>W,<words>' lines give the representative words of the post
    topics and its '@C<id>W,<words>' lines those of the comment topics.
    Coherence is evaluated against the root/comment corpora rebuilt from
    ../data/document.tsv, and two rows (root, comment) per pair are written
    to lda/lda_coherence.txt.  If any topic of a side has no word line the
    coherence for that side is reported as 0.0.
    """
    from gensim.corpora import Dictionary
    from gensim.utils import simple_preprocess
    from gensim.parsing.preprocessing import STOPWORDS
    from gensim.models.coherencemodel import CoherenceModel

    def parse_word_line(line,prefix):
        # '<prefix><topic id>W,<w1> <w2> ...' -> (topic id, [w1, w2, ...]);
        # None for any other line.  The id may have several digits: reading
        # only one character dropped every topic with id >= 10.
        head,sep,terms = line.partition(',')
        if not sep or not head.startswith(prefix) or not head.endswith('W'):
            return None
        topicid = head[len(prefix):-1]
        if not topicid.isdecimal():
            return None
        return int(topicid),terms.split(' ')

    def coherence_of(topics,corpus,dct):
        # 0.0 unless every topic has a word list.
        if any(len(topic) == 0 for topic in topics):
            return 0.
        cm = CoherenceModel(topics=topics,corpus=corpus,dictionary=dct,coherence='u_mass')
        return cm.get_coherence()

    textsRoot,textsComment = [],[]
    with open('../data/document.tsv','r') as fr:
        fr.readline()
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            textWord = [token for token in simple_preprocess(arr[2])
                        if token not in STOPWORDS and len(token) > 3]
            if arr[1] == 'Y':
                textsRoot.append(textWord)
            else:
                textsComment.append(textWord)

    dctRoot = Dictionary(textsRoot)
    corpusRoot = [dctRoot.doc2bow(text) for text in textsRoot]
    dctComment = Dictionary(textsComment)
    corpusComment = [dctComment.doc2bow(text) for text in textsComment]

    with open('lda/lda_coherence.txt','w') as fw:
        fw.write('NUM_TOPICS_ROOT\tNUM_TOPICS_COMMENT\tROOT/COMMENT\tCOHERENCE\n')
        for [NUM_TOPICS_ROOT,NUM_TOPICS_COMMENT] in PAIRS_NUM_TOPICS:
            topicsRoot = [[] for i in range(NUM_TOPICS_ROOT)]
            topicsComment = [[] for i in range(NUM_TOPICS_COMMENT)]
            with open('lda/network_lda_'+str(NUM_TOPICS_ROOT)+'_'+str(NUM_TOPICS_COMMENT)+'.txt','r') as fr:
                # Post lines are only honoured before the first '@C' line;
                # from that line on only comment lines are.
                in_comment_part = False
                for line in fr:
                    line = line.strip('\r\n')
                    if line.startswith('@C'):
                        in_comment_part = True
                    if in_comment_part:
                        parsed,topics = parse_word_line(line,'@C'),topicsComment
                    else:
                        parsed,topics = parse_word_line(line,'@P'),topicsRoot
                    if parsed is not None:
                        topics[parsed[0]] = parsed[1]

            coherence = coherence_of(topicsRoot,corpusRoot,dctRoot)
            fw.write(str(NUM_TOPICS_ROOT)+'\t'+str(NUM_TOPICS_COMMENT)+'\t'+'root'+'\t'+str(coherence)+'\n')
            coherence = coherence_of(topicsComment,corpusComment,dctComment)
            fw.write(str(NUM_TOPICS_ROOT)+'\t'+str(NUM_TOPICS_COMMENT)+'\t'+'comment'+'\t'+str(coherence)+'\n')

def lda_network(NUM_TOPICS_ROOT,NUM_TOPICS_COMMENT):
    RATIO_MARGIN = 0.2
    NUM_DOCS_PER_TOPIC = 20
    import numpy as np

    docs = []
    fr = open('../data/data.tsv','r')
    fr.readline()
    fr.readline()
    for line in fr:
        arr = line.strip('\r\n').split('\t')
        docs.append([arr[1],arr[2],arr[3]])
    fr.close()

    word2topicid2scoreRoot,phrase2topicid2scoreRoot = {},{}
    word2topicid2scoreComment,phrase2topicid2scoreComment = {},{}
    fr = open('lda/lda_topic_terms_root_'+str(NUM_TOPICS_ROOT)+'-word.txt','r')
    fr.readline()
    for line in fr:
        arr = line.strip('\r\n').split('\t')
        topicid,term,score = int(arr[0]),arr[1],float(arr[2])
        if not term in word2topicid2scoreRoot:
            word2topicid2scoreRoot[term] = {}
        word2topicid2scoreRoot[term][topicid] = score
    fr.close()
    fr = open('lda/lda_topic_terms_root_'+str(NUM_TOPICS_ROOT)+'-phrase.txt','r')
    for line in fr:
        arr = line.strip('\r\n').split('\t')
        topicid,term,score = int(arr[0]),arr[1],float(arr[2])
        if not term in phrase2topicid2scoreRoot:
            phrase2topicid2scoreRoot[term] = {}
        phrase2topicid2scoreRoot[term][topicid] = score
    fr.close()
    fr = open('lda/lda_topic_terms_comment_'+str(NUM_TOPICS_COMMENT)+'-word.txt','r')
    fr.readline()    
    for line in fr:
        arr = line.strip('\r\n').split('\t')
        topicid,term,score = int(arr[0]),arr[1],float(arr[2])
        if not term in word2topicid2scoreComment:
            word2topicid2scoreComment[term] = {}
        word2topicid2scoreComment[term][topicid] = score
    fr.close()
    fr = open('lda/lda_topic_terms_comment_'+str(NUM_TOPICS_COMMENT)+'-phrase.txt','r')
    for line in fr:
        arr = line.strip('\r\n').split('\t')
        topicid,term,score = int(arr[0]),arr[1],float(arr[2])
        if not term in phrase2topicid2scoreComment:
            phrase2topicid2scoreComment[term] = {}
        phrase2topicid2scoreComment[term][topicid] = score
    fr.close()
    topicid2wordsphrasesRoot = [[{},{}] for i in range(NUM_TOPICS_ROOT)]
    topicid2wordsphrasesComment = [[{},{}] for i in range(NUM_TOPICS_COMMENT)]
    for [term,topicid2score] in word2topicid2scoreRoot.items():
        topicid_score = sorted(topicid2score.items(),key=lambda x:-x[1])
        if len(topicid_score) == 1 or topicid_score[0][1] > topicid_score[1][1]*(1.+RATIO_MARGIN):
            topicid,score = topicid_score[0]
            topicid2wordsphrasesRoot[topicid][0][term] = score
    for [term,topicid2score] in phrase2topicid2scoreRoot.items():
        topicid_score = sorted(topicid2score.items(),key=lambda x:-x[1])
        if len(topicid_score) == 1 or topicid_score[0][1] > topicid_score[1][1]*(1.+RATIO_MARGIN):
            topicid,score = topicid_score[0]
            topicid2wordsphrasesRoot[topicid][1][term] = score
    for [term,topicid2score] in word2topicid2scoreComment.items():
        topicid_score = sorted(topicid2score.items(),key=lambda x:-x[1])
        if len(topicid_score) == 1 or topicid_score[0][1] > topicid_score[1][1]*(1.+RATIO_MARGIN):
            topicid,score = topicid_score[0]
            topicid2wordsphrasesComment[topicid][0][term] = score
    for [term,topicid2score] in phrase2topicid2scoreComment.items():
        topicid_score = sorted(topicid2score.items(),key=lambda x:-x[1])
        if len(topicid_score) == 1 or topicid_score[0][1] > topicid_score[1][1]*(1.+RATIO_MARGIN):
            topicid,score = topicid_score[0]
            topicid2wordsphrasesComment[topicid][1][term] = score

    topicassign = []
    linenoscoreRoot = [[] for i in range(NUM_TOPICS_ROOT)]
    linenoscoreComment = [[] for i in range(NUM_TOPICS_COMMENT)]
    fr = open('lda/lda_doc_topics_'+str(NUM_TOPICS_ROOT)+'_'+str(NUM_TOPICS_COMMENT)+'.txt','r')
    lineno = 0
    for line in fr:
        arr = line.strip('\r\n').split(' ')
        topicid_score = []
        for item in arr:
            _arr = item.split(':')
            topicid = int(_arr[0])
            score = float(_arr[1])
            topicid_score.append([topicid,score])
        topicid_score = sorted(topicid_score,key=lambda x:-x[1])
        if len(topicid_score) == 1 or topicid_score[0][1] > topicid_score[1][1]*(1.+RATIO_MARGIN):
            topicid,score = topicid_score[0]
            topicassign.append(topicid)
            if not docs[lineno][1][0] == '[' and not docs[lineno][1] == 'None':
                if docs[lineno][0] == 'Y':
                    linenoscoreRoot[topicid].append(['{'+docs[lineno][1]+'}: '+docs[lineno][2],score])
                else:
                    linenoscoreComment[topicid].append(['{'+docs[lineno][1]+'}: '+docs[lineno][2],score])
        else:
            topicassign.append(-1)
        lineno += 1
    fr.close()

    fw = open('lda/network_lda_'+str(NUM_TOPICS_ROOT)+'_'+str(NUM_TOPICS_COMMENT)+'.txt','w')

    # Representative words & phrases
    for topicid in range(NUM_TOPICS_ROOT):
        s = ''
        for [term,score] in sorted(topicid2wordsphrasesRoot[topicid][0].items(),key=lambda x:-x[1]):
            s += ' '+term
        fw.write('@P'+str(topicid)+'W,'+s[1:]+'\n')
        s = ''
        for [term,score] in sorted(topicid2wordsphrasesRoot[topicid][1].items(),key=lambda x:-x[1]):
            s += ' '+term
        fw.write('@P'+str(topicid)+'P,'+s[1:]+'\n')
        doc_score = sorted(linenoscoreRoot[topicid],key=lambda x:-x[1])
        for i in range(min(len(doc_score),NUM_DOCS_PER_TOPIC)):
            fw.write('@P'+str(topicid)+'D'+str(i)+','+doc_score[i][0]+'\n')
        fw.write('\n')
    for topicid in range(NUM_TOPICS_COMMENT):
        s = ''
        for [term,score] in sorted(topicid2wordsphrasesComment[topicid][0].items(),key=lambda x:-x[1]):
            s += ' '+term
        fw.write('@C'+str(topicid)+'W,'+s[1:]+'\n')
        s = ''
        for [term,score] in sorted(topicid2wordsphrasesComment[topicid][1].items(),key=lambda x:-x[1]):
            s += ' '+term
        fw.write('@C'+str(topicid)+'P,'+s[1:]+'\n')
        doc_score = sorted(linenoscoreComment[topicid],key=lambda x:-x[1])
        for i in range(min(len(doc_score),NUM_DOCS_PER_TOPIC)):
            fw.write('@C'+str(topicid)+'D'+str(i)+','+doc_score[i][0]+'\n')
        fw.write('\n')

    # Post-comment topic volume/vote networks
    topicRoot2count = [set() for i in range(NUM_TOPICS_ROOT)]
    topicComment2count = [set() for i in range(NUM_TOPICS_COMMENT)]    
    topicRoot2topicComment2num = [[0 for j in range(NUM_TOPICS_COMMENT)] for i in range(NUM_TOPICS_ROOT)]
    topicRoot2topicComment2scoresA = [[[] for j in range(NUM_TOPICS_COMMENT)] for i in range(NUM_TOPICS_ROOT)] # -, 0, +    
    topicRoot2topicComment2scoresB = [[[] for j in range(NUM_TOPICS_COMMENT)] for i in range(NUM_TOPICS_ROOT)] # -, +
    topicRoot2topicComment2scoresC = [[[] for j in range(NUM_TOPICS_COMMENT)] for i in range(NUM_TOPICS_ROOT)] # +
    fr = open('../data/bipartite.csv','r')
    for line in fr:
        arr = line.strip('\r\n').split(',')
        rootid,commentid = int(arr[0]),int(arr[1])
        topicRoot = topicassign[rootid]
        topicComment = topicassign[commentid]
        if topicRoot < 0 or topicComment < 0: continue
        topicRoot2count[topicRoot].add(rootid)
        topicComment2count[topicComment].add(commentid)
        score = int(arr[2])-1
        topicRoot2topicComment2num[topicRoot][topicComment] += 1
        topicRoot2topicComment2scoresA[topicRoot][topicComment].append(score)        
        if not score == 0:
            topicRoot2topicComment2scoresB[topicRoot][topicComment].append(score)
        if score > 0:
            topicRoot2topicComment2scoresC[topicRoot][topicComment].append(score)
    fr.close()

    sumcountRoot = sum([len(x) for x in topicRoot2count])
    sumcountComment = sum([len(x) for x in topicComment2count])
    for i in range(NUM_TOPICS_ROOT):
        num = len(topicRoot2count[i])
        pct = np.round(1.*num/sumcountRoot,4)
        fw.write('%'+'P'+str(i)+','+str(num)+','+str(pct)+'\n')
    for i in range(NUM_TOPICS_COMMENT):
        num = len(topicComment2count[i])
        pct = np.round(1.*num/sumcountComment,4)
        fw.write('%'+'C'+str(i)+','+str(num)+','+str(pct)+'\n')
    fw.write('\n')

    topicRoot2num = [sum(topicRoot2topicComment2num[i]) for i in range(NUM_TOPICS_ROOT)]
    for i in range(NUM_TOPICS_ROOT):
        for j in range(NUM_TOPICS_COMMENT):
            num = topicRoot2topicComment2num[i][j]
            pct = 0.
            if topicRoot2num[i] > 0: pct = np.round(1.*num/topicRoot2num[i],4)
            fw.write('#'+'P'+str(i)+','+'C'+str(j)+','+str(num)+','+str(pct)+'\n')
    fw.write('\n')

    for i in range(NUM_TOPICS_ROOT):
        for j in range(NUM_TOPICS_COMMENT):
            scores = np.array(topicRoot2topicComment2scoresA[i][j])
            avgscore,stdscore,maxscore,minscore = 0,0,0,0
            if len(scores) > 0:
                avgscore = np.mean(scores)
                stdscore = np.std(scores)
                minscore = np.min(scores)
                maxscore = np.max(scores)
            fw.write('$'+'P'+str(i)+','+'C'+str(j)+','+str(np.round(avgscore,4))+','+str(np.round(stdscore,4)) \
                    +','+str(np.round(minscore,4))+','+str(np.round(maxscore,4))+'\n')
    fw.write('\n')
    
    for i in range(NUM_TOPICS_ROOT):
        for j in range(NUM_TOPICS_COMMENT):
            scores = np.array(topicRoot2topicComment2scoresB[i][j])
            avgscore,stdscore,maxscore,minscore = 0,0,0,0
            if len(scores) > 0:
                avgscore = np.mean(scores)
                stdscore = np.std(scores)
                minscore = np.min(scores)
                maxscore = np.max(scores)
            fw.write('!'+'P'+str(i)+','+'C'+str(j)+','+str(np.round(avgscore,4))+','+str(np.round(stdscore,4)) \
                    +','+str(np.round(minscore,4))+','+str(np.round(maxscore,4))+'\n')
    fw.write('\n')
    
    for i in range(NUM_TOPICS_ROOT):
        for j in range(NUM_TOPICS_COMMENT):
            scores = np.array(topicRoot2topicComment2scoresC[i][j])
            avgscore,stdscore,maxscore,minscore = 0,0,0,0
            if len(scores) > 0:
                avgscore = np.mean(scores)
                stdscore = np.std(scores)
                minscore = np.min(scores)
                maxscore = np.max(scores)
            fw.write('+'+'P'+str(i)+','+'C'+str(j)+','+str(np.round(avgscore,4))+','+str(np.round(stdscore,4)) \
                    +','+str(np.round(minscore,4))+','+str(np.round(maxscore,4))+'\n')
    fw.write('\n')

    fw.close()

##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

def analy_choose_topics():
    """Write the statistics used to choose the number of LDA topics.

    The report ``lda/analy_choose_topics.txt`` has three parts:

    1. For root topic counts 3..10 and comment topic counts 6, 8, ..., 20:
       the second tab-separated field of the first line of
       ``lda/lda_topic_terms_{root,comment}_<n>-word.txt`` (reported under
       the ``LOG_PERPLEXITY`` column) and the coherence value looked up in
       ``lda/lda_coherence.txt``.
    2. For every topic count, the volume shares found on the ``%P`` / ``%C``
       lines of ``lda/network_lda_<root>_<2*root>.txt`` in descending order,
       preceded by the largest gap between neighbouring shares divided by
       the uniform share ``1/num_topics``.
    3. Corpus statistics from ``../data/data.tsv``: mean/median body length
       in space-separated tokens for posts and comments, mean comment score
       (column 6 minus one; also over non-negative scores only), mean
       comments per post and the fraction of posts without comments.

    All inputs are read before the report file is opened, so a missing or
    malformed input no longer leaves a truncated, half-written report.
    If ``data.tsv`` holds no posts the last fraction is written as ``nan``
    instead of raising ``ZeroDivisionError``.
    """
    import numpy as np

    root_counts = list(range(3,11))
    comment_counts = [2*k for k in root_counts]  # 6, 8, ..., 20

    def first_line_fields(path):
        # Tab-separated fields of the first line of `path`.
        with open(path,'r') as fr:
            return fr.readline().strip('\r\n').split('\t')

    def volume_row(label,numtopics,pcts):
        # One report row: label, topic count, relative biggest gap, shares.
        pcts = sorted(pcts,reverse=True)
        gap = 0.
        for i in range(len(pcts)-1):
            gap = max(gap,pcts[i]-pcts[i+1])
        cells = [label,str(numtopics),str(gap/(1./numtopics))]
        cells.extend(str(pct) for pct in pcts)
        return '\t'.join(cells)+'\n'

    report = []

    # perplexity & coherence
    topicid2coherenceRoot = {}
    topicid2coherenceComment = {}
    with open('lda/lda_coherence.txt','r') as fr:
        fr.readline()  # header
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            # Column 2 says which model the row belongs to; the key is that
            # model's own topic count (column 0 for root, column 1 otherwise).
            if arr[2] == 'root':
                topicid2coherenceRoot[int(arr[0])] = arr[3]
            else:
                topicid2coherenceComment[int(arr[1])] = arr[3]
    report.append('NUM_TOPICS_ROOT\tLOG_PERPLEXITY\tCOHERENCE\n')
    for i in root_counts:
        arr = first_line_fields('lda/lda_topic_terms_root_'+str(i)+'-word.txt')
        report.append(str(i)+'\t'+arr[1]+'\t'+topicid2coherenceRoot[i]+'\n')
    report.append('NUM_TOPICS_COMMENT\tLOG_PERPLEXITY\tCOHERENCE\n')
    for i in comment_counts:
        arr = first_line_fields('lda/lda_topic_terms_comment_'+str(i)+'-word.txt')
        report.append(str(i)+'\t'+arr[1]+'\t'+topicid2coherenceComment[i]+'\n')

    # volume distributions
    numtopics2pctsRoot = {}
    numtopics2pctsComment = {}
    for numtopicRoot in root_counts:
        numtopicComment = numtopicRoot*2
        pctsRoot,pctsComment = [],[]
        with open('lda/network_lda_'+str(numtopicRoot)+'_'+str(numtopicComment)+'.txt','r') as fr:
            for line in fr:
                # startswith() is safe on short lines such as a lone '%'.
                if line.startswith('%P'):
                    pctsRoot.append(float(line.strip('\r\n').split(',')[2]))
                elif line.startswith('%C'):
                    pctsComment.append(float(line.strip('\r\n').split(',')[2]))
        numtopics2pctsRoot[numtopicRoot] = pctsRoot
        numtopics2pctsComment[numtopicComment] = pctsComment
    report.append('ROOT/COMMENT\tNUM_TOPICS\tREL_BIGGEST_GAP\tVOLUME_DISTRIBUTION\n')
    for numtopics in sorted(numtopics2pctsRoot):
        report.append(volume_row('root',numtopics,numtopics2pctsRoot[numtopics]))
    for numtopics in sorted(numtopics2pctsComment):
        report.append(volume_row('comment',numtopics,numtopics2pctsComment[numtopics]))

    # length of posts/comments
    wordlensRoot,wordlensComment = [],[]
    scoresComment,scoresCommentNonNeg = [],[]
    linkid2numcomment = {}
    with open('../data/data.tsv','r') as fr:
        fr.readline()  # counts line
        fr.readline()  # column names
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            linkid = arr[0]
            wordlen = len(arr[3].split(' '))
            score = int(arr[6])-1
            if arr[1] == 'Y':
                wordlensRoot.append(wordlen)
                linkid2numcomment[linkid] = 0
            else:
                wordlensComment.append(wordlen)
                scoresComment.append(score)
                if score >= 0:
                    scoresCommentNonNeg.append(score)
                # Relies on a post row preceding its comment rows.
                linkid2numcomment[linkid] += 1
    nums = list(linkid2numcomment.values())
    n = len(nums)
    n0 = nums.count(0)
    # Matches the 'nan' numpy reports for the other statistics on empty data.
    fraction_uncommented = 1.*n0/n if n > 0 else float('nan')
    report.append('Mean, Median Length of Posts (words)'+'\t'+str(np.mean(wordlensRoot))+'\t'+str(np.median(wordlensRoot))+'\n')
    report.append('Mean, Median Length of Comments (words)'+'\t'+str(np.mean(wordlensComment))+'\t'+str(np.median(wordlensComment))+'\n')
    report.append('Average Upvotes per Comment'+'\t'+str(np.mean(scoresComment))+'\tNonnegative: '+str(np.mean(scoresCommentNonNeg))+'\n')
    report.append('Average Comments per Post'+'\t'+str(np.mean(nums))+'\n')
    report.append('% of Posts with no Comments'+'\t'+str(fraction_uncommented)+'\n')

    with open('lda/analy_choose_topics.txt','w') as fw:
        fw.writelines(report)

def correlation_topics(num_topics_root=5, num_topics_comment=10):
    """Write pairwise Pearson correlations between LDA topics.

    Each document row in ``../data/data.tsv`` (after its two header lines) is
    matched, by position, with one line of
    ``lda/lda_doc_topics_<root>_<comment>.txt`` holding space-separated
    ``topicid:score`` items.  Rows whose column 1 is ``'Y'`` feed the post
    topics, every other row feeds the comment topics; a topic missing from a
    line scores ``0.``.  For posts and for comments a topic-by-topic matrix
    is written twice, once computed with ``scipy.stats.pearsonr`` and once
    with ``numpy.corrcoef``, each value rounded to 4 decimals.  A final line
    reports ``pearsonr`` between comment length (space-separated tokens of
    column 3) and comment score (column 6 minus one) over rows whose
    column 1 is ``'N'``.

    Parameters:
        num_topics_root: number of post topics (default 5).
        num_topics_comment: number of comment topics (default 10).

    The two counts select the input file and name the output file
    ``lda/correlation_topics_<root>_<comment>.txt``; the defaults reproduce
    the previously hard-coded 5/10 behaviour.
    """
    import numpy as np
    from scipy.stats import pearsonr

    def by_pearsonr(a,b):
        return pearsonr(a,b)[0]

    def by_corrcoef(a,b):
        return np.corrcoef(a,b)[0,1]

    def write_matrix(fw,title,topic2scores,corr):
        # Title, header row of topic ids, one row per topic, blank line.
        ids = list(range(len(topic2scores)))
        fw.write(title+'\n')
        fw.write(''.join('\t'+str(i) for i in ids)+'\n')
        for i in ids:
            cells = [str(i)]
            for j in ids:
                cells.append(str(np.round(corr(topic2scores[i],topic2scores[j]),4)))
            fw.write('\t'.join(cells)+'\n')
        fw.write('\n')

    # One pass over the corpus: root flag per row, plus length/score of comments.
    rootflags,lens,scores = [],[],[]
    with open('../data/data.tsv','r') as fr:
        fr.readline()
        fr.readline()
        for line in fr:
            arr = line.strip('\r\n').split('\t')
            rootflags.append(arr[1] == 'Y')
            if arr[1] == 'N':
                lens.append(len(arr[3].split(' ')))
                scores.append(int(arr[6])-1)

    suffix = str(num_topics_root)+'_'+str(num_topics_comment)
    post2topic2score = [[] for _ in range(num_topics_root)]
    comment2topic2score = [[] for _ in range(num_topics_comment)]
    with open('lda/lda_doc_topics_'+suffix+'.txt','r') as fr:
        for is_root in rootflags:
            topicid2score = {}
            for item in fr.readline().strip('\r\n').split(' '):
                arr = item.split(':')
                topicid2score[int(arr[0])] = float(arr[1])
            # Append this document's score to every topic column of its kind.
            columns = post2topic2score if is_root else comment2topic2score
            for topicid,column in enumerate(columns):
                column.append(topicid2score.get(topicid,0.))

    post_title = 'Post ('+str(num_topics_root)+' topics): '
    comment_title = 'Comment ('+str(num_topics_comment)+' topics): '
    with open('lda/correlation_topics_'+suffix+'.txt','w') as fw:
        write_matrix(fw,post_title+'scipy.stats.pearsonr',post2topic2score,by_pearsonr)
        write_matrix(fw,post_title+'numpy.corrcoef',post2topic2score,by_corrcoef)
        write_matrix(fw,comment_title+'scipy.stats.pearsonr',comment2topic2score,by_pearsonr)
        write_matrix(fw,comment_title+'numpy.corrcoef',comment2topic2score,by_corrcoef)
        fw.write('Length vs Upvote: '+str(pearsonr(lens,scores))+'\n')

def similarity_topics(num_topics_root=5, num_topics_comment=10):
    """Write pairwise cosine similarities between LDA topics.

    Each document row in ``../data/data.tsv`` (after its two header lines) is
    matched, by position, with one line of
    ``lda/lda_doc_topics_<root>_<comment>.txt`` holding space-separated
    ``topicid:score`` items.  Rows whose column 1 is ``'Y'`` feed the post
    topics, every other row feeds the comment topics; a topic missing from a
    line scores ``0.``.  For posts and then for comments a topic-by-topic
    matrix of ``1 - scipy.spatial.distance.cosine`` over the per-document
    score columns is written, rounded to 4 decimals.

    Parameters:
        num_topics_root: number of post topics (default 5).
        num_topics_comment: number of comment topics (default 10).

    The two counts select the input file and name the output file
    ``lda/similarity_topics_<root>_<comment>.txt``; the defaults reproduce
    the previously hard-coded 5/10 behaviour.
    """
    import numpy as np
    from scipy.spatial.distance import cosine

    def write_matrix(fw,title,topic2scores):
        # Title, header row of topic ids, one row per topic, blank line.
        ids = list(range(len(topic2scores)))
        fw.write(title+'\n')
        fw.write(''.join('\t'+str(i) for i in ids)+'\n')
        for i in ids:
            cells = [str(i)]
            for j in ids:
                similarity = 1.-cosine(topic2scores[i],topic2scores[j])
                cells.append(str(np.round(similarity,4)))
            fw.write('\t'.join(cells)+'\n')
        fw.write('\n')

    # Root flag of every document row, in file order.
    with open('../data/data.tsv','r') as fr:
        fr.readline()
        fr.readline()
        rootflags = [line.strip('\r\n').split('\t')[1] == 'Y' for line in fr]

    suffix = str(num_topics_root)+'_'+str(num_topics_comment)
    post2topic2score = [[] for _ in range(num_topics_root)]
    comment2topic2score = [[] for _ in range(num_topics_comment)]
    with open('lda/lda_doc_topics_'+suffix+'.txt','r') as fr:
        for is_root in rootflags:
            topicid2score = {}
            for item in fr.readline().strip('\r\n').split(' '):
                arr = item.split(':')
                topicid2score[int(arr[0])] = float(arr[1])
            # Append this document's score to every topic column of its kind.
            columns = post2topic2score if is_root else comment2topic2score
            for topicid,column in enumerate(columns):
                column.append(topicid2score.get(topicid,0.))

    with open('lda/similarity_topics_'+suffix+'.txt','w') as fw:
        write_matrix(fw,'Post ('+str(num_topics_root)+' topics)',post2topic2score)
        write_matrix(fw,'Comment ('+str(num_topics_comment)+' topics)',comment2topic2score)

if __name__ == '__main__':
    # Pipeline driver: uncomment the stages below to run them.
    # `pass` keeps this guard a valid block while every stage is commented
    # out; an `if` whose suite holds only comments is an IndentationError,
    # which would stop the whole file from being imported or executed.
    pass

#    data_integration()
#    data_bipartite()
#    data_timeliness()
#    data_document()
#    data_collection()

#    pairs_num_topics = []
#    for i in range(3,11):
#        pairs_num_topics.append([i,i*2])
#    for [num_topics_root,num_topics_comment] in pairs_num_topics:
#        lda_run(num_topics_root,num_topics_comment)
#    for [num_topics_root,num_topics_comment] in pairs_num_topics:
#        lda_network(num_topics_root,num_topics_comment)
#    lda_coherence(pairs_num_topics)
#    analy_choose_topics()

#    correlation_topics()
#    similarity_topics()


